#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <simd.h>
#include <utility>
template<typename T>
long rpcc(T &waitval){
  long ret;
  asm ("rcsr %0, 4\n\t" : "=r"(ret), "+r"(waitval)::"memory");
  return ret;
}
union longv4u {
  long sval[4];
  int256 vval;
};
union doublev4u{
  double sval[4];
  doublev4 vval;
};
__attribute__((section(".ldm"))) const unsigned long MAGIC_NUM = 0x5fe6eb50c7aa19f9L;
__attribute__((section(".ldm"))) const double f1_5 = 1.5;
__attribute__((section(".ldm"))) const longv4u MAGICV4 = {.sval = {0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L}};
__attribute__((section(".ldm"))) const doublev4u F1_5v4 = {.sval = {1.5, 1.5, 1.5, 1.5}};
__attribute__((section(".ldm"))) const doublev4u f0_5v4 = {.sval = {0.5, 0.5, 0.5, 0.5}};
__always_inline double invsqrt_almost_asm(double x) {
  double t1 = f1_5, x2, y, f15, t2;
  long t0 = MAGIC_NUM;
  asm volatile("fmsd %[X], %[T1], %[X], %[X2]\n\t"
               "srl  %[X], 1, %[Y]\n\t"
               "subl %[T0], %[Y], %[Y]\n\t"
               "fmuld %[Y], %[T1], %[X]\n\t"
               "fmuld %[Y], %[Y], %[T0]\n\t"
               "fmuld %[Y], %[X2], %[T2]\n\t"
               "fnmad %[T2], %[T0], %[X], %[Y]\n\t"
               "fmuld %[Y], %[T1], %[X]\n\t"
               "fmuld %[Y], %[Y], %[T0]\n\t"
               "fmuld %[Y], %[X2], %[T2]\n\t"
               "fnmad %[T2], %[T0], %[X], %[Y]\n\t"
               "fmuld %[Y], %[T1], %[X]\n\t"
               "fmuld %[Y], %[Y], %[T0]\n\t"
               "fmuld %[Y], %[X2], %[T2]\n\t"
               "fnmad %[T2], %[T0], %[X], %[Y]\n\t"
               : [Y] "=&r"(y), [T0] "+&r"(t0), [T1] "+&r"(t1), [X2] "=&r"(x2), [X]"+&r"(x), [T2] "=r"(t2));
  return y;
}
__always_inline doublev4 invsqrt_almost_asm_v4(doublev4 x) {
  doublev4 t1, x2, y, f15, t2;
  int256 t0 = MAGICV4.vval;
  asm volatile("srlow %[X], 1, %[Y]\n\t"
               "ldih %[T1], 0x3ff8($31)\n\t"
               "sll  %[T1], 32, %[T1]\n\t"
               "vshff %[T1], %[T1], 0, %[T1]\n\t"
               "vmsd %[X], %[T1], %[X], %[X2]\n\t"
               "vcpys $31, %[Y], %[Y]\n\t"
               "vsubl %[T0], %[Y], %[Y]\n\t"
               "vmuld %[Y], %[T1], %[X]\n\t"
               "vmuld %[Y], %[Y], %[T0]\n\t"
               "vmuld %[Y], %[X2], %[T2]\n\t"
               "vnmad %[T2], %[T0], %[X], %[Y]\n\t"
               "vmuld %[Y], %[T1], %[X]\n\t"
               "vmuld %[Y], %[Y], %[T0]\n\t"
               "vmuld %[Y], %[X2], %[T2]\n\t"
               "vnmad %[T2], %[T0], %[X], %[Y]\n\t"
               "vmuld %[Y], %[T1], %[X]\n\t"
               "vmuld %[Y], %[Y], %[T0]\n\t"
               "vmuld %[Y], %[X2], %[T2]\n\t"
               "vnmad %[T2], %[T0], %[X], %[Y]\n\t"
               : [Y] "=&r"(y), [T0] "+&r"(t0), [T1] "+&r"(t1), [X2] "=&r"(x2), [X]"+&r"(x), [T2] "=r"(t2));
  return y;
}
__always_inline double invsqrt_purec(double x) {
  double x2 = x - x * f1_5;
  union {
    unsigned long l;
    double d;
  } tmp;
  tmp.d = x;
  tmp.l = MAGIC_NUM - (tmp.l >> 1);
  double y = tmp.d;

  y = __builtin_fma(y*y, x2*y, f1_5*y);
  y = __builtin_fma(y*y, x2*y, f1_5*y);
  y = __builtin_fma(y*y, x2*y, f1_5*y);

  return y;
}
template<typename T>
T tfma(T a, T b, T c){
  return a * b + c;
}

template<typename T>
__always_inline T touch(T &ref){
  asm volatile("" : "+r"(ref));
  return ref;
}

//#define touch(x) x
__always_inline doublev4 invsqrt_purecv4(doublev4 x) {
  doublev4 f1_5v4;// = __builtin_sw_vcpyfd(1.5);
  
  asm("vshff %1, %1, 0, %0\n\t" :"=r"(f1_5v4) : "r"(f1_5));
  doublev4 x2 = x - x * f1_5v4;
  union {
    int256 l;
    doublev4 d;
  } tmp, magic;
  asm("vshff %1, %1, 0, %0\n\t" :"=r"(magic.l) : "r"(MAGIC_NUM));
  tmp.d = x;
  tmp.l >>= 1;
  tmp.d = __builtin_sw_vcpysd(f1_5v4, tmp.d);
  //printf("%lx\n", *(long*)&tmp.l);
  tmp.l = magic.l - tmp.l;
  
  //printf("%f\n", *(double*)&tmp.d);
  doublev4 y = tmp.d;
  doublev4 y15;
  y15 = f1_5v4 * y;
  y = tfma(y*y, x2*y, touch(y15));
  y15 = f1_5v4 * y;
  y = tfma(y*y, x2*y, touch(y15));
  y15 = f1_5v4 * y;
  y = tfma(y*y, x2*y, touch(y15));

  return y;
}
__attribute__((noinline)) long test(double f){
  doublev4 fv4;
  asm("vshff %1, %1, 0, %0\n\t" : "=r"(fv4) : "r"(f));
  double r = invsqrt_almost_asm(f);
  volatile long st = rpcc(fv4);
  doublev4 rv4 = invsqrt_almost_asm_v4(fv4);
  //doublev4 rv4 = invsqrt_purecv4(fv4);
  //r = invsqrt_purec(f);
  volatile long ed = rpcc(rv4);
  printf("%f %ld\n", *((double*)&rv4 + 1), ed - st);
  return ed - st;

}
int main(int argc, char **argv){
  double f = atof(argv[1]);
  printf("%ld\n", test(f));
}
